import pandas as pd
import numpy as np
import nltk
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
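# Note: nltk.word_tokenize (used further down) and the stopword list depend on
# NLTK data packages; if they are missing, download them once:
# nltk.download('stopwords')
# nltk.download('punkt')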
df = pd.read_csv("data job posts.csv")
df = df[df['Title'].notnull()]  # keep only rows with a job title
df.reset_index(drop=True, inplace=True)  # drop=True avoids adding the old index as a column
df.head(5)
# Get the basic info of DataFrame and perform programmatic assessment
df.info()
# Extract the job titles and store them in X
X=df['Title']
len(X)
import re
corpus = []
for i in range(0, len(X)):
    review = re.sub(r'\W', ' ', str(X[i]))        # keep word characters only
    review = re.sub(r'^br$', ' ', review)         # drop stray 'br' artifacts
    review = re.sub(r'\s+[a-z]\s+', ' ', review)  # drop single-letter tokens
    review = re.sub(r'^[a-z]\s+', ' ', review)    # drop a leading single letter
    review = re.sub(r'\s+', ' ', review)          # collapse repeated whitespace
    corpus.append(review)
# Tokenize all titles and remove punctuation
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuation
data_words = list(sent_to_words(corpus))
data_words[0]
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=3, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
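# Quick sanity check (illustrative): the Phraser joins frequently co-occurring
# tokens with an underscore once a pair clears min_count/threshold, e.g.
# ['software', 'developer'] may come back as ['software_developer']. The exact
# merges depend on the fitted corpus.
print(trigram_mod[bigram_mod[data_words[0]]])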
stop_words = stopwords.words('english')
# Domain-specific stop words, lowercased: simple_preprocess lowercases all
# tokens, so capitalized entries such as 'Senior' would never match
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'na', 'senior', 'new',
                   'branch', 'junior', 'unit', 'department', 'specialist', 'the'])
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
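# Illustrative usage (the exact lemmas depend on the spaCy model version):
# lemmatization([['developers', 'writing', 'tests']])
# should return roughly [['developer', 'write', 'test']].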
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])
data_lemmatized[0:10]
# A few titles lemmatize to empty lists, so drop them
list2 = [x for x in data_lemmatized if x != []]
for i in range(len(list2)):
    list2[i] = ' '.join(list2[i])
list2[0:5]
titlecount = {}
for data in list2:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in titlecount:
            titlecount[word] = 1
        else:
            titlecount[word] += 1
len(titlecount)
import heapq
# Select the 1000 most frequent words as features
freq_words = heapq.nlargest(1000,titlecount,key=titlecount.get)
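# The counting loop above can be written more compactly with
# collections.Counter; this sketch produces the same frequencies
# (titlecount_alt/freq_words_alt are illustrative names, and tie
# order may differ from heapq.nlargest):
from collections import Counter
titlecount_alt = Counter(word for data in list2 for word in nltk.word_tokenize(data))
freq_words_alt = [w for w, _ in titlecount_alt.most_common(1000)]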
#gather features
text = " ".join(freq_words)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
wordcloud = WordCloud(stopwords=[],max_font_size=60).generate(text)
plt.figure(figsize=(16,12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
for i in range(len(data_lemmatized)):
    data_lemmatized[i] = ' '.join(data_lemmatized[i])
date_field=df['Year'].tolist()
len(date_field)
Job_year = pd.DataFrame(np.column_stack([data_lemmatized, date_field]),
                        columns=['Job_title', 'Year'])
Job_year.head(5)
Job_year.dtypes
#Converting year to numeric value
Job_year['Year']=Job_year['Year'].astype('int')
Job_year.dtypes
Job_year.Year.value_counts()
# Divide the years into 3 equal-width bins to track the job nature over the period
Job_year['Year_bins']=pd.cut(Job_year['Year'],3,labels=['Period1','Period2','Period3'])
Job_year.pivot_table(values='Year',index='Year_bins',aggfunc=['min','max','count'])
X1=Job_year[Job_year['Year_bins']=='Period1'].iloc[:,0]
# Count word frequencies in Period1 and keep the top 500 words for the word cloud
titlecount = {}
for data in X1:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in titlecount:
            titlecount[word] = 1
        else:
            titlecount[word] += 1
freq_words = heapq.nlargest(500,titlecount,key=titlecount.get)
#gather features
text = " ".join(freq_words)
# Draw the word cloud for the nature of jobs in Period1
wordcloud = WordCloud(stopwords=[],max_font_size=60).generate(text)
plt.figure(figsize=(16,12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
X2=Job_year[Job_year['Year_bins']=='Period2'].iloc[:,0]
# Count word frequencies in Period2 and keep the top 500 words for the word cloud
titlecount = {}
for data in X2:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in titlecount:
            titlecount[word] = 1
        else:
            titlecount[word] += 1
freq_words = heapq.nlargest(500,titlecount,key=titlecount.get)
#gather features
text = " ".join(freq_words)
wordcloud = WordCloud(stopwords=[],max_font_size=60).generate(text)
plt.figure(figsize=(16,12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
X3=Job_year[Job_year['Year_bins']=='Period3'].iloc[:,0]
# Count word frequencies in Period3 and keep the top 500 words for the word cloud
titlecount = {}
for data in X3:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in titlecount:
            titlecount[word] = 1
        else:
            titlecount[word] += 1
freq_words = heapq.nlargest(500,titlecount,key=titlecount.get)
#gather features
text = " ".join(freq_words)
wordcloud = WordCloud(stopwords=[],max_font_size=60).generate(text)
plt.figure(figsize=(16,12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# Defining X
df['X1'] = df['Title'].str.cat(df['JobRequirment'], sep =" ").str.cat(df['RequiredQual'], sep =" ")
X=df['X1']
# Defining Y
di={False:0,True:1}
df['IT_y']=df['IT'].map(di)
y=df['IT_y']
# Creating the corpus
import re
corpus = []
for i in range(0, len(X)):
    review = re.sub(r'\W', ' ', str(X[i]))
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+[a-z]\s+', ' ', review)
    review = re.sub(r'^[a-z]\s+', ' ', review)
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)
corpus[0]
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
# Stemming
for i in range(len(corpus)):
    words = nltk.word_tokenize(corpus[i])
    words = [stemmer.stem(word) for word in words]
    corpus[i] = ' '.join(words)
corpus[0]
# Creating the Tf-Idf model directly
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 2000, min_df = 0.05, max_df = 0.8, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()
X.shape
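# Quick sanity check on the fitted vocabulary (get_feature_names_out requires
# scikit-learn >= 1.0; older releases expose get_feature_names instead):
# print(vectorizer.get_feature_names_out()[:20])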
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=42)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit( train_x, train_y )
#Predicting the test cases
from sklearn import metrics
test_accuracy=metrics.accuracy_score(test_y,logreg.predict(test_x))
print('test_accuracy: ',test_accuracy)
train_accuracy=metrics.accuracy_score(train_y,logreg.predict(train_x))
print('train_accuracy: ',train_accuracy)
print('AUC train :',metrics.roc_auc_score(train_y,logreg.predict(train_x)))
print('AUC test :',metrics.roc_auc_score(test_y,logreg.predict(test_x)))
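# Note: passing hard 0/1 predictions to roc_auc_score evaluates a single
# operating point; the usual ranking AUC scores the predicted probabilities
# instead (a sketch):
print('AUC test (probabilities):',
      metrics.roc_auc_score(test_y, logreg.predict_proba(test_x)[:, 1]))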
# Creating a confusion matrix
from sklearn import metrics
cm = metrics.confusion_matrix(test_y, logreg.predict(test_x),
                              labels=[0, 1])  # labels is keyword-only in current scikit-learn
cm
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
sn.heatmap(cm, annot=True, fmt='.0f', xticklabels=["0", "1"], yticklabels=["0", "1"])  # counts are integers
plt.ylabel('True label')
plt.xlabel('Predicted label')
from sklearn.metrics import classification_report
print(classification_report(test_y,logreg.predict(test_x)))
test_predicted_prob=pd.DataFrame(logreg.predict_proba(test_x))[[1]]
test_predicted_prob.columns=['prob']
actual=test_y.reset_index()
actual.drop('index',axis=1,inplace=True)
# making a DataFrame with actual and prob columns
df_test_predict = pd.concat([actual, test_predicted_prob], axis=1)
df_test_predict.columns = ['actual','prob']
df_test_predict.head()
test_roc_like_df = pd.DataFrame()
test_temp = df_test_predict.copy()
for cut_off in np.linspace(0, 1, 50):
    test_temp['predicted'] = test_temp['prob'].apply(lambda x: 0 if x < cut_off else 1)
    test_temp['tp'] = test_temp.apply(lambda x: 1 if x['actual']==1 and x['predicted']==1 else 0, axis=1)
    test_temp['fp'] = test_temp.apply(lambda x: 1 if x['actual']==0 and x['predicted']==1 else 0, axis=1)
    test_temp['tn'] = test_temp.apply(lambda x: 1 if x['actual']==0 and x['predicted']==0 else 0, axis=1)
    test_temp['fn'] = test_temp.apply(lambda x: 1 if x['actual']==1 and x['predicted']==0 else 0, axis=1)
    sensitivity = test_temp['tp'].sum() / (test_temp['tp'].sum() + test_temp['fn'].sum())
    specificity = test_temp['tn'].sum() / (test_temp['tn'].sum() + test_temp['fp'].sum())
    accuracy = (test_temp['tp'].sum() + test_temp['tn'].sum()) / (test_temp['tp'].sum() + test_temp['fn'].sum() + test_temp['tn'].sum() + test_temp['fp'].sum())
    test_roc_like_table = pd.DataFrame([cut_off, sensitivity, specificity, accuracy]).T
    test_roc_like_table.columns = ['cutoff', 'sensitivity', 'specificity', 'accuracy']
    test_roc_like_df = pd.concat([test_roc_like_df, test_roc_like_table], axis=0)
test_roc_like_df.head()
test_temp.sum()
plt.subplots(figsize=(10,4))
plt.scatter(test_roc_like_df['cutoff'], test_roc_like_df['sensitivity'], marker='*', label='Sensitivity')
plt.scatter(test_roc_like_df['cutoff'], test_roc_like_df['specificity'], marker='*', label='Specificity')
#plt.scatter(test_roc_like_df['cutoff'], 1-test_roc_like_df['specificity'], marker='*', label='FPR')
plt.title('Sensitivity and specificity at each cutoff')
plt.legend()
# Find the ideal cutoff, to check whether it holds up in out-of-sample validation.
# Note: this block maximizes sensitivity + accuracy; the random-forest section
# below uses sensitivity + specificity (Youden's J) instead.
test_roc_like_df['total'] = test_roc_like_df['sensitivity'] + test_roc_like_df['accuracy']
test_roc_like_df[test_roc_like_df['total']==test_roc_like_df['total'].max()]
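# Cross-check with scikit-learn (a sketch): roc_curve returns TPR/FPR at every
# distinct threshold, and the common Youden's J criterion (TPR - FPR) yields a
# comparable cutoff to the scan above.
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(df_test_predict['actual'], df_test_predict['prob'])
print('Youden-J cutoff:', thresholds[(tpr - fpr).argmax()])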
df_test_predict['predicted'] = df_test_predict['prob'].apply(lambda x: 1 if x > 0.122449 else 0)  # cutoff from the scan above
import seaborn as sns
sns.heatmap(pd.crosstab(df_test_predict['actual'], df_test_predict['predicted']), annot=True, fmt='.0f')
accuracy=metrics.accuracy_score(df_test_predict.actual, df_test_predict.predicted)
print('Accuracy: ',round(accuracy,2))
from sklearn.metrics import classification_report
print(classification_report(df_test_predict.actual, df_test_predict.predicted))
from sklearn.naive_bayes import GaussianNB
nb_clf=GaussianNB()
nb_clf.fit(train_x,train_y)
#Predicting the test cases
from sklearn import metrics
test_accuracy=metrics.accuracy_score(test_y,nb_clf.predict(test_x))
print('test_accuracy: ',test_accuracy)
train_accuracy=metrics.accuracy_score(train_y,nb_clf.predict(train_x))
print('train_accuracy: ',train_accuracy)
print('AUC train :',metrics.roc_auc_score(train_y,nb_clf.predict(train_x)))
print('AUC test :',metrics.roc_auc_score(test_y,nb_clf.predict(test_x)))
from sklearn.metrics import classification_report
print(classification_report(test_y,nb_clf.predict(test_x)))
# Creating a confusion matrix
from sklearn import metrics
cm = metrics.confusion_matrix(test_y, nb_clf.predict(test_x),
                              labels=[0, 1])  # labels is keyword-only in current scikit-learn
cm
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
sn.heatmap(cm, annot=True, fmt='.0f', xticklabels=["0", "1"], yticklabels=["0", "1"])  # counts are integers
plt.ylabel('True label')
plt.xlabel('Predicted label')
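# GaussianNB assumes continuous, normally distributed features; for TF-IDF
# weights MultinomialNB is the more conventional baseline and may be worth
# comparing (a sketch; mnb_clf is an illustrative name):
from sklearn.naive_bayes import MultinomialNB
mnb_clf = MultinomialNB()
mnb_clf.fit(train_x, train_y)
print('MultinomialNB test accuracy:',
      metrics.accuracy_score(test_y, mnb_clf.predict(test_x)))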
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid={'n_estimators':[100, 200, 400, 600, 800]}
tree=GridSearchCV(RandomForestClassifier(oob_score=False,warm_start=True),param_grid,cv=5,n_jobs=-1)
tree.fit(train_x,train_y)
tree.best_params_
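# Note: GridSearchCV refits the best configuration on the full training split
# (refit=True by default), so tree.best_estimator_ could be reused directly;
# the next cell instead fits a fresh forest with the chosen n_estimators.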
radm_clf=RandomForestClassifier(oob_score=True,n_estimators=800,n_jobs=-1)
radm_clf.fit(train_x,train_y)
#Predicting the test cases
from sklearn import metrics
test_accuracy=metrics.accuracy_score(test_y,radm_clf.predict(test_x))
print('test_accuracy: ',test_accuracy)
train_accuracy=metrics.accuracy_score(train_y,radm_clf.predict(train_x))
print('train_accuracy: ',train_accuracy)
print('AUC train :',metrics.roc_auc_score(train_y,radm_clf.predict(train_x)))
print('AUC test :',metrics.roc_auc_score(test_y,radm_clf.predict(test_x)))
from sklearn.metrics import classification_report
print(classification_report(test_y,radm_clf.predict(test_x)))
# Creating a confusion matrix
from sklearn import metrics
cm = metrics.confusion_matrix(test_y, radm_clf.predict(test_x),
                              labels=[0, 1])  # labels is keyword-only in current scikit-learn
cm
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
sn.heatmap(cm, annot=True, fmt='.0f', xticklabels=["0", "1"], yticklabels=["0", "1"])  # counts are integers
plt.ylabel('True label')
plt.xlabel('Predicted label')
test_predicted_prob=pd.DataFrame(radm_clf.predict_proba(test_x))[[1]]
test_predicted_prob.columns=['prob']
actual=test_y.reset_index()
actual.drop('index',axis=1,inplace=True)
# making a DataFrame with actual and prob columns
df_test_predict = pd.concat([actual, test_predicted_prob], axis=1)
df_test_predict.columns = ['actual','prob']
df_test_predict.head()
test_roc_like_df = pd.DataFrame()
test_temp = df_test_predict.copy()
for cut_off in np.linspace(0, 1, 50):
    test_temp['predicted'] = test_temp['prob'].apply(lambda x: 0 if x < cut_off else 1)
    test_temp['tp'] = test_temp.apply(lambda x: 1 if x['actual']==1 and x['predicted']==1 else 0, axis=1)
    test_temp['fp'] = test_temp.apply(lambda x: 1 if x['actual']==0 and x['predicted']==1 else 0, axis=1)
    test_temp['tn'] = test_temp.apply(lambda x: 1 if x['actual']==0 and x['predicted']==0 else 0, axis=1)
    test_temp['fn'] = test_temp.apply(lambda x: 1 if x['actual']==1 and x['predicted']==0 else 0, axis=1)
    sensitivity = test_temp['tp'].sum() / (test_temp['tp'].sum() + test_temp['fn'].sum())
    specificity = test_temp['tn'].sum() / (test_temp['tn'].sum() + test_temp['fp'].sum())
    accuracy = (test_temp['tp'].sum() + test_temp['tn'].sum()) / (test_temp['tp'].sum() + test_temp['fn'].sum() + test_temp['tn'].sum() + test_temp['fp'].sum())
    test_roc_like_table = pd.DataFrame([cut_off, sensitivity, specificity, accuracy]).T
    test_roc_like_table.columns = ['cutoff', 'sensitivity', 'specificity', 'accuracy']
    test_roc_like_df = pd.concat([test_roc_like_df, test_roc_like_table], axis=0)
test_roc_like_df.head(5)
test_temp.sum()
plt.subplots(figsize=(10,4))
plt.scatter(test_roc_like_df['cutoff'], test_roc_like_df['sensitivity'], marker='*', label='Sensitivity')
plt.scatter(test_roc_like_df['cutoff'], test_roc_like_df['specificity'], marker='*', label='Specificity')
#plt.scatter(test_roc_like_df['cutoff'], 1-test_roc_like_df['specificity'], marker='*', label='FPR')
plt.title('Sensitivity and specificity at each cutoff')
plt.legend()
# Find the ideal cutoff (sensitivity + specificity, i.e. Youden's J) for OOS validation
test_roc_like_df['total'] = test_roc_like_df['sensitivity'] + test_roc_like_df['specificity']
test_roc_like_df[test_roc_like_df['total']==test_roc_like_df['total'].max()]
df_test_predict['predicted'] = df_test_predict['prob'].apply(lambda x: 1 if x > 0.204082 else 0)  # cutoff from the scan above
import seaborn as sns
sns.heatmap(pd.crosstab(df_test_predict['actual'], df_test_predict['predicted']), annot=True, fmt='.0f')
accuracy=metrics.accuracy_score(df_test_predict.actual, df_test_predict.predicted)
print('Accuracy: ',round(accuracy,2))
from sklearn.metrics import classification_report
print(classification_report(df_test_predict.actual, df_test_predict.predicted))
train_x.shape
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
title=list2
title[0:5]
# Creating the Tf-Idf model directly
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 2000, min_df = 0.01, max_df = 0.9, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(title).toarray()
X.shape
cluster_range = range(1, 21)
cluster_errors = []
for num_clusters in cluster_range:
    clusters = KMeans(n_clusters=num_clusters)
    clusters.fit(X)
    cluster_errors.append(clusters.inertia_)
clusters_df = pd.DataFrame( { "num_clusters":cluster_range, "cluster_errors": cluster_errors } )
clusters_df[0:21]
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,6))
plt.plot( clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )
from sklearn import metrics
# calculate the silhouette coefficient for K=2 through K=20
k_range = range(2, 21)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X)
    scores.append(metrics.silhouette_score(X, km.labels_))
# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)
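# Pick the k with the highest silhouette score (a sketch; best_k is an
# illustrative name, and the elbow plot above is the other guide):
best_k = list(k_range)[int(np.argmax(scores))]
print('Best k by silhouette:', best_k)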
true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()
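# Assigning a new, hypothetical job title to a cluster (a sketch; the text must
# pass through the same fitted vectorizer):
new_vec = vectorizer.transform(['senior software developer'])
print('Cluster for new title:', model.predict(new_vec.toarray()))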